import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import geopandas as gpd
import json
from shapely.geometry import Point
from IPython.display import Image, display
import seaborn as sns
from pysal.lib import weights
from libpysal.weights import w_subset
from pysal.explore import esda
from numpy.random import seed
from pysal.viz import splot
from splot.esda import plot_moran
from splot import esda as esdaplot
import contextily
from sklearn.neighbors import NearestNeighbors
# Import preprocessed data
folder_path = "data/final_processed_data/"
neighborhood_stats = gpd.read_file(folder_path + "neighborhood_stats.gpkg")
airbnb_gdf = gpd.read_file(folder_path + "airbnb_listings.gpkg")
airbnb_prices = gpd.read_file(folder_path + "airbnb_prices.gpkg")
airbnb_prices_entirespace = gpd.read_file(folder_path + "airbnb_prices_entirespace.gpkg")
airbnb_prices_entirespace_highseason = gpd.read_file(folder_path + "airbnb_prices_entirespace_highseason.gpkg")
airbnb_prices_entirespace_lowseason = gpd.read_file(folder_path + "airbnb_prices_entirespace_lowseason.gpkg")
poi_gdf = gpd.read_file(folder_path + "tourism_pois.gpkg")
museums_gdf = gpd.read_file(folder_path + "museum_pois.gpkg")
galleries_gdf = gpd.read_file(folder_path + "gallery_pois.gpkg")
monuments_gdf = gpd.read_file(folder_path + "monument_pois.gpkg")
buurten_gdf = gpd.read_file(folder_path + "neighborhood_polygons.gpkg")

# All layers must share the projected Dutch national CRS (RD New, EPSG:28992)
# before any distance-based work (centroids, KNN weights). Checking a list with
# all() replaces the original 11-clause `or` chain, which was easy to get wrong
# when adding/removing a layer.
TARGET_CRS = "EPSG:28992"
_layers = [
    neighborhood_stats, airbnb_gdf, airbnb_prices, airbnb_prices_entirespace,
    airbnb_prices_entirespace_highseason, airbnb_prices_entirespace_lowseason,
    poi_gdf, museums_gdf, galleries_gdf, monuments_gdf, buurten_gdf,
]
if all(layer.crs == TARGET_CRS for layer in _layers):
    print("All dataset CRS are set to EPSG:28992")
else:
    print("Convert all datasets to EPSG:28992")
All dataset CRS are set to EPSG:28992
Determining k-value for K-Nearest Neighbors¶
When considering a suitable k-value range for our autocorrelation analysis, we can find an optimal range where each increment in k yields the smallest change in median and maximum distance for each neighborhood polygon in the city of Amsterdam. The k values with the smallest changes between increments are from k=3 to k=12, so we can use this range to test for the most optimal Moran's I value when computing the global autocorrelation statistic, as defined in the function which we can deploy for each attribute we test in the analysis.
# Determine best k value for KNN
# Inspect how the distance to the k-th nearest neighbourhood centroid grows
# with k; a flat stretch suggests a stable range of candidate k values.
centroids = neighborhood_stats.geometry.centroid
coords = np.column_stack([centroids.x, centroids.y])
for k in range(1, 16):
    # Request k+1 neighbours because each point's nearest neighbour is itself.
    knn_model = NearestNeighbors(n_neighbors=k + 1).fit(coords)
    distances, _ = knn_model.kneighbors(coords)
    kth_dist = distances[:, -1]  # distance to the k-th true neighbour
    print(f"k={k}: median dist={np.median(kth_dist):.0f}, max dist={np.max(kth_dist):.0f}")
#k_value = 5
def set_k_value(df, attr, lowest_k, highest_k):
    """Print global Moran's I for `attr` across a range of KNN k values.

    For each k in [lowest_k, highest_k] a KNN weights matrix is built from
    `df`, rows with a missing `attr` value are dropped from the weights via
    w_subset, the subset is row-standardized, and the global Moran's I with
    its simulated p-value is printed so the best k can be picked by
    inspection.

    Parameters
    ----------
    df : geopandas.GeoDataFrame
        Polygons carrying the attribute to test.
    attr : str
        Column to compute Moran's I for.
    lowest_k, highest_k : int
        Inclusive range of k values to evaluate.
    """
    # The missing-value mask is loop-invariant: hoist it out of the loop.
    mask = df[attr].notna()
    valid_ids = df.index[mask].tolist()
    for k in range(lowest_k, highest_k + 1):
        w_k = weights.KNN.from_dataframe(df, k=k)
        # Subset first, then row-standardize; transforming the full matrix
        # before subsetting (as the original did) is discarded by w_subset.
        w_sub = w_subset(w_k, valid_ids)
        w_sub.transform = "R"
        # Values must follow the subset weights' ordering exactly.
        y = df.loc[w_sub.id_order, attr].to_numpy()
        moran = esda.Moran(y, w_sub)
        print(f"k={k:2d} -> Moran's I = {moran.I:.3f}, p = {moran.p_sim:.3f}")
k=1: median dist=341, max dist=2625 k=2: median dist=422, max dist=3343 k=3: median dist=500, max dist=3896 k=4: median dist=555, max dist=4153 k=5: median dist=628, max dist=4405 k=6: median dist=679, max dist=4548 k=7: median dist=715, max dist=4681 k=8: median dist=768, max dist=5040 k=9: median dist=803, max dist=5060 k=10: median dist=854, max dist=5111 k=11: median dist=885, max dist=5325 k=12: median dist=931, max dist=5353 k=13: median dist=968, max dist=5505 k=14: median dist=1018, max dist=5542 k=15: median dist=1058, max dist=5578
Defining functions to perform autocorrelation analysis¶
Since there are multiple angles to investigate regarding the airbnb and tourism spatial datasets, it will be useful to define functions to do so and repeat them later by changing the input parameters. This way it will be easier to compare all airbnbs vs only airbnbs listing the entire space, to compare high season vs low season, to compare specific tourism POIs like museums, etc.
def prep_data(df, k_value):
    """Return a copy of `df` plus a row-standardized KNN weights matrix for it."""
    working = df.copy()
    # Build k-nearest-neighbour spatial weights from the GeoDataFrame.
    knn_w = weights.KNN.from_dataframe(working, k=k_value)
    # Row-standardize so spatial lags become neighbour averages.
    knn_w.transform = "R"
    return working, knn_w
def spatial_lag(df, att_col, att_lag_col, weight_matrix, k_value=None):
    """Add the spatial lag of `att_col` to `df` as the new column `att_lag_col`.

    Parameters
    ----------
    df : geopandas.GeoDataFrame
        Frame holding the attribute; modified in place and also returned.
    att_col : str
        Source attribute column name.
    att_lag_col : str
        Name of the lag column to create.
    weight_matrix : libpysal.weights.W
        Spatial weights used to compute the lag.
    k_value : int, optional
        Unused by this function; kept (now with a default) so existing
        positional call sites remain valid.

    Returns
    -------
    geopandas.GeoDataFrame
        The same `df`, with `att_lag_col` added.
    """
    df[att_lag_col] = weights.spatial_lag.lag_spatial(weight_matrix, df[att_col])
    return df
def plot_spatial_lag(df, att_col, att_lag_col, k_value, title, figsizew=12, figsizeh=6, colormap="plasma", scheme="quantiles", edgecolor="grey", linewidth=0.25, transparency=1, legendon=True):
    """Plot an attribute and its spatial lag side by side as choropleth maps."""
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(figsizew, figsizeh))
    # Same classification settings on both panels so they are comparable.
    panels = [
        (att_col, str(title), left_ax),
        (att_lag_col, str(title) + " - Spatial Lag", right_ax),
    ]
    for column, panel_title, axis in panels:
        df.plot(
            column=column, cmap=colormap, scheme=scheme, k=k_value,
            edgecolor=edgecolor, linewidth=linewidth, alpha=transparency,
            legend=legendon, ax=axis,
        )
        axis.set_axis_off()
        axis.set_title(panel_title)
    plt.show()
def standardize_values(df, w, att_col, att_lag_col, att_col_std, att_lag_col_std):
    """Add a mean-centred copy of `att_col` and its spatial lag (Moran scatter inputs).

    `att_lag_col` is accepted but not read here, mirroring the original call
    signature used throughout the notebook.
    """
    # Centre on the mean so quadrants of the Moran scatterplot sit around 0.
    centred = df[att_col] - df[att_col].mean()
    df[att_col_std] = centred
    df[att_lag_col_std] = weights.lag_spatial(w, centred)
    return df
def plot_standardized_values(df, att_col_std, att_lag_col_std, title, figsizew=3, figsizeh=3, confidence=25, marker=".", pointcolor="blue", linecolor="red", label_frac=0.75, label_fontsize=10, label_weight="bold", label_color="red"):
    """Draw a Moran scatterplot: standardized attribute vs. its spatial lag.

    A regression line is fitted through the points and the four quadrants
    are labelled HH / HL / LL / LH at `label_frac` of each axis limit.

    Parameters
    ----------
    df : DataFrame holding the two standardized columns.
    att_col_std, att_lag_col_std : str
        Column names for the centred attribute and its centred lag.
    title : str
        Plot title prefix; " - Standardized Values" is appended.
    Remaining keyword arguments control figure size and styling.
    """
    f, ax = plt.subplots(1, figsize=(figsizew, figsizeh))
    # Pass ax explicitly: regplot otherwise draws on the *current* axes, which
    # only happened to be `ax` because it had just been created.
    sns.regplot(
        x=att_col_std, y=att_lag_col_std, ci=confidence, data=df,
        marker=marker, scatter_kws={"color": pointcolor},
        line_kws={"color": linecolor}, ax=ax,
    )
    # Reference lines at zero split the plot into the four Moran quadrants.
    ax.axvline(0, c="k", alpha=0.5)
    ax.axhline(0, c="k", alpha=0.5)
    ax.set_title(str(title) + " - Standardized Values")
    x_min, x_max = ax.get_xlim()
    y_min, y_max = ax.get_ylim()
    frac = label_frac
    # Place each quadrant label a fixed fraction of the way to the axis limit.
    label_kwargs = dict(ha="center", va="center", fontsize=label_fontsize,
                        fontweight=label_weight, color=label_color)
    ax.text(frac * x_max, frac * y_max, "HH", **label_kwargs)
    ax.text(frac * x_max, frac * y_min, "HL", **label_kwargs)
    ax.text(frac * x_min, frac * y_min, "LL", **label_kwargs)
    ax.text(frac * x_min, frac * y_max, "LH", **label_kwargs)
    plt.show()
def handle_missing_values(df, w, att_col, att_col_std, att_lag_col_std, w_col, std_w_col):
    """Drop rows with a missing `att_col` from both data and weights, then re-derive lags.

    Returns
    -------
    w_sub : libpysal.weights.W
        Weights restricted to non-missing rows, row-standardized.
    y : numpy.ndarray
        `att_col` values ordered to match `w_sub.id_order`.
    df_nonmissingpolygons : copy of the non-missing rows, with the lag
        (`w_col`), the mean-centred attribute (`att_col_std`) and the lag of
        the centred attribute (`std_w_col`) columns added.

    NOTE(review): `att_lag_col_std` is accepted but never used; the centred
    lag is written to `std_w_col` instead — confirm callers expect this.
    """
    # neighborhoods that have a *non-missing* average price
    mask = df[att_col].notna()
    # IDs that we want to keep in the weights object
    valid_ids = df.index[mask].tolist()
    # Subset the weights to only those ids
    w_sub = w_subset(w, valid_ids)
    # Row-standardize (good practice for Moran's I)
    w_sub.transform = "R"
    # y must be in the same order as w_sub.id_order
    y = df.loc[w_sub.id_order, att_col].to_numpy()
    # filter gdf with only polygons with prices, reindexed to the weights order
    df_nonmissingpolygons = df.loc[w_sub.id_order].copy()
    # Spatial lags and standardized variable (only on df_nonmissingpolygons);
    # the mean is recomputed over the non-missing subset only.
    df_nonmissingpolygons[w_col] = weights.lag_spatial(w_sub, df_nonmissingpolygons[att_col])
    df_nonmissingpolygons[att_col_std] = (df_nonmissingpolygons[att_col] - df_nonmissingpolygons[att_col].mean())
    df_nonmissingpolygons[std_w_col] = weights.lag_spatial(w_sub, df_nonmissingpolygons[att_col_std])
    return w_sub, y, df_nonmissingpolygons
def run_global_MoranI(df, w, att=None):
    """Compute, print and return global Moran's I.

    `df` is either a bare value array (when `att` is None, e.g. the `y`
    returned by handle_missing_values) or a DataFrame from which column
    `att` is taken.
    """
    values = df if att is None else df[att]
    moran = esda.moran.Moran(values, w)
    print("Moran's I:", moran.I)
    print("p-value:", moran.p_sim)
    return moran
def plot_global_MoranI(moran):
    """Render splot's Moran plot (reference distribution + Moran scatterplot)."""
    plot_moran(moran)
def run_LISA(df, w, att=None, mask=None, siglevel=0.05):
    """Run Local Moran's I (LISA) and return it with a labelled GeoDataFrame.

    Parameters
    ----------
    df : GeoDataFrame or array-like
        When `att` is given, a frame whose column `att` is analysed; when
        `att` is None, a bare value array (e.g. the `y` returned by
        handle_missing_values), in which case `mask` must supply the
        matching GeoDataFrame rows.
    w : libpysal.weights.W
        Spatial weights aligned with the values.
    att : str, optional
        Column of `df` to analyse.
    mask : GeoDataFrame, optional
        Output frame when `df` is a bare array.
    siglevel : float, default 0.05
        Pseudo p-value threshold for the "cluster" labels.

    Returns
    -------
    (esda.moran.Moran_Local, GeoDataFrame)
        The LISA result and a copy of the frame with I, Q, Quad, p_value,
        significance_level and cluster columns added.

    Raises
    ------
    ValueError
        If `att` is None and no `mask` is supplied (the original code hit an
        UnboundLocalError in that case).
    """
    if att is None:
        if mask is None:
            raise ValueError("mask is required when att is None (df is a value array)")
        lisa = esda.moran.Moran_Local(df, w)
        df_output = mask.copy()
    else:
        lisa = esda.moran.Moran_Local(df[att], w)
        df_output = df.copy()
    df_output["I"] = lisa.Is
    df_output["Q"] = lisa.q
    quad_map = {1: "High-High", 2: "Low-High", 3: "Low-Low", 4: "High-Low"}
    df_output["Quad"] = pd.Series(lisa.q, index=df_output.index).map(quad_map)
    df_output["p_value"] = lisa.p_sim
    # Rows below the significance threshold keep their quadrant label; the
    # rest are collapsed into a single "Not significant" class.
    df_output["significance_level"] = df_output["p_value"] < siglevel
    df_output["cluster"] = np.where(df_output["significance_level"], df_output["Quad"], "Not significant")
    return lisa, df_output
def plot_LISA(lisa, df, w, mask=None, siglevel=0.05, figsizew=10, figsizeh=5, sigcolor="green"):
    """Plot the LISA statistic distribution and quadrant counts (all vs. significant).

    `df` and `w` are kept for signature symmetry with the other helpers;
    only `mask` (when given) or `df` supplies the row index for labelling.
    """
    target = mask if mask is not None else df
    labels_map = {1: "HighHigh", 2: "LowHigh", 3: "LowLow", 4: "HighLow"}
    quads = pd.Series(lisa.q, index=target.index).map(labels_map)
    ordering = ["HighHigh", "LowLow", "HighLow", "LowGH".replace("GH", "High")]
    total_counts = quads.value_counts().reindex(ordering)
    # Keep only observations whose pseudo p-value clears the threshold.
    significant_quads = quads[lisa.p_sim < siglevel]
    significant_counts = significant_quads.value_counts().reindex(ordering)
    sig_string = "Statistically Significant (α = " + str(siglevel) + ")"
    counts = pd.DataFrame({"All Observations": total_counts, sig_string: significant_counts}).fillna(0)
    fig, (density_ax, bar_ax) = plt.subplots(1, 2, figsize=(figsizew, figsizeh))
    # Left panel: density of local statistics with a rug of the raw values.
    sns.kdeplot(lisa.Is, ax=density_ax, label="Density", color="grey")
    sns.rugplot(lisa.Is, ax=density_ax, label="LISA Observations (Std Dev)", color="red", height=0.1)
    density_ax.set_title("Local Moran's I")
    density_ax.legend()
    # Right panel: grouped bars comparing all vs. significant quadrant counts.
    counts.plot(kind="bar", color=["grey", sigcolor], ax=bar_ax)
    bar_ax.set_title(f"LISA quadrant counts (α = {siglevel})")
    bar_ax.set_xlabel("Observation Value & Neighbor Likeness")
    bar_ax.set_ylabel("Count")
    plt.tight_layout()
    plt.show()
def map_LISA(lisa,df,w,mask=None,siglevel=0.05,localstats=True,quad_categories=True,significance=True,clustermap=True,figsze=6,
    stats_cmap="cividis", stats_scheme="quantiles", stats_edgecolor="grey", stats_linewidth=0.25, stats_transparency=1, stats_legend=True,
    quad_c1="#306844", quad_c2="#98FBCB", quad_c3="#ff8503", quad_c4="#FFEE8C", quad_edgecolor="grey", quad_linewidth=0.25, quad_legend="upper right",
    sig_cmap="BrBG",sig_linewidth=0.25,sig_edgecolor="white",sig_transparency=0.55,sig_legend=True,
    clust_c1="#306844", clust_c2="#98FBCB", clust_c3="#ff8503", clust_c4="#FFEE8C", clust_c5="#d9d9d9", clust_edgecolor="grey", clust_linewidth=0.25, clust_legend="upper right"):
    """Map up to four LISA panels: local statistics, quadrants, significance, clusters.

    Panels are toggled by the localstats / quad_categories / significance /
    clustermap flags and arranged on a 1x1, 1x2 or 2x2 grid. The frame that
    gets plotted is `mask` when supplied (e.g. the cluster frame from
    run_LISA), otherwise `df`; the quadrant and cluster panels read its
    "Quad" and "cluster" columns.

    NOTE(review): `w` and `sig_legend` are accepted but never used (the
    significance panel hard-codes legend=True). With exactly 3 panels the
    unused 4th axes of the 2x2 grid is never turned off — confirm intended.
    """
    gdf_to_plot = mask if mask is not None else df
    # Collect the requested panels in a fixed display order.
    panels = []
    if localstats:
        panels.append("localstats")
    if quad_categories:
        panels.append("quadrant")
    if significance:
        panels.append("significance")
    if clustermap:
        panels.append("cluster")
    n = len(panels)
    # Grid layout: 1 panel -> 1x1, 2 -> 1x2, 3 or 4 -> 2x2.
    if n == 1:
        nrows, ncols = 1, 1
    elif n == 2:
        nrows, ncols = 1, 2
    else:
        nrows, ncols = 2, 2
    figsize = (figsze * ncols, figsze * nrows)
    f, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)
    # Flatten so single- and multi-axes cases iterate uniformly.
    axs = np.atleast_1d(axs).ravel()
    quad_colors = {"High-High": quad_c1, "High-Low": quad_c2, "Low-Low": quad_c3, "Low-High": quad_c4}
    cluster_colors = {"Not significant": clust_c5, "High-High": clust_c1, "High-Low": clust_c2, "Low-Low": clust_c3, "Low-High": clust_c4}
    for ax, panel in zip(axs, panels):
        if panel == "localstats":
            # Quantile choropleth (5 classes) of the local Moran's I values.
            gdf_to_plot.assign(Is=lisa.Is).plot(column="Is",cmap=stats_cmap,scheme=stats_scheme,k=5,edgecolor=stats_edgecolor,linewidth=stats_linewidth,alpha=stats_transparency,legend=stats_legend,ax=ax)
            ax.set_title("Local Statistics", y=1)
        elif panel == "quadrant":
            legend_handles = []
            # Paint each quadrant class in its own colour, skipping empty classes.
            for label, color in quad_colors.items():
                sel = gdf_to_plot["Quad"] == label
                if sel.any():
                    gdf_to_plot.loc[sel].plot(ax=ax,color=color,edgecolor=quad_edgecolor,linewidth=quad_linewidth)
                    # Make an explicit legend patch for this label
                    legend_handles.append(mpatches.Patch(color=color, label=label))
            ax.set_title("LISA Quadrants", y=1)
            if legend_handles:
                ax.legend(handles=legend_handles,title="Quadrant",loc=quad_legend,fontsize=6,)
        elif panel == "significance":
            # Binary significant / non-significant classification at `siglevel`.
            labels = (pd.Series(1 * (lisa.p_sim < siglevel), index=gdf_to_plot.index).map({1: "Significant", 0: "Non-Significant"}))
            gdf_to_plot.assign(cl=labels).plot(column="cl",categorical=True,k=2,cmap=sig_cmap,linewidth=sig_linewidth,edgecolor=sig_edgecolor,alpha=sig_transparency,legend=True,ax=ax)
            ax.set_title(f"Statistical Significance (α = {siglevel})", y=1)
        elif panel == "cluster":
            legend_handles = []
            # Same idea as the quadrant panel, but using the thresholded
            # "cluster" labels (non-significant rows collapsed into grey).
            for label, color in cluster_colors.items():
                sel = gdf_to_plot["cluster"] == label
                if sel.any():
                    gdf_to_plot.loc[sel].plot(ax=ax,color=color,edgecolor=clust_edgecolor,linewidth=clust_linewidth)
                    legend_handles.append(mpatches.Patch(color=color, label=label))
            ax.set_title(f"Moran Cluster Map (α = {siglevel})", y=1)
            if legend_handles:
                ax.legend(handles=legend_handles,title="Cluster",loc=clust_legend,fontsize=6,
                )
        ax.set_axis_off()
    f.tight_layout()
    plt.show()
Airbnb Autocorrelation Analysis¶
Average Airbnb Prices per neighborhood¶
Describe the attribute we are analyzing
We used a K-nearest neighbors spatial weights matrix with k = 5. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 5 and declining gradually for larger k.
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
# Attribute under study and the derived column names used by the helpers below.
att_col, att_lag_col = "avg_price_airbnb", "avg_price_lag"
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))
k_value = 5  # picked from the k sweep printed by set_k_value below
set_k_value(gdf_copy, att_col, 3, 12)
# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.326, p = 0.001 k= 4 -> Moran's I = 0.329, p = 0.001 k= 5 -> Moran's I = 0.333, p = 0.001 k= 6 -> Moran's I = 0.326, p = 0.001 k= 7 -> Moran's I = 0.319, p = 0.001 k= 8 -> Moran's I = 0.317, p = 0.001 k= 9 -> Moran's I = 0.308, p = 0.001 k=10 -> Moran's I = 0.307, p = 0.001 k=11 -> Moran's I = 0.298, p = 0.001 k=12 -> Moran's I = 0.293, p = 0.001
Avg Airbnb Prices - Spatial Lag¶
Describe spatial lag
# Spatial Lag: add the neighbour-average price column and map both side by side.
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Avg Airbnb Prices", colormap="YlGn")
Observe what we see in the plot
Avg Airbnb Prices - Standardized Data¶
Explain why we standardize (normalize) and how
# Standardize: mean-centre the attribute and draw the Moran scatterplot.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Avg Airbnb Prices", pointcolor="green")
Observe what we see in the plot
Avg Airbnb Prices - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I, computed on the non-missing subset only (weights are
# subset and re-standardized so missing neighborhoods do not distort the lag).
w_sub, y, gdf_masked = handle_missing_values(gdf, weight_matrix, att_col, att_col_std, att_lag_col_std, w_col, std_w_col)
moran = run_global_MoranI(y, w_sub)
plot_global_MoranI(moran)
Moran's I: 0.3326235709944917 p-value: 0.001
Observe what we see in the plot
Avg Airbnb Prices - Local Indicators of Spatial Association (LISA)¶
# LISA on the masked values; gdf_masked supplies the geometry/index since
# `y` is a bare array.
lisa, cluster_df = run_LISA(y, w_sub, att=None, mask=gdf_masked)
airbnb_prices_clusterdf = cluster_df.copy()  # keep for later comparison
plot_LISA(lisa, y, w_sub, mask=gdf_masked)
Observe what we see in the plot
map_LISA(lisa, y, w_sub, mask=cluster_df)
Observe what we see in the plot
Airbnb prices for private listings¶
Describe the attribute we are analyzing
We used a K-nearest neighbors spatial weights matrix with k = 4. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 4 and declining gradually for larger k.
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "avg_price_airbnb_entirespace", "avg_price_airbnb_entirespace_lag"
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))
k_value = 4
set_k_value(gdf_copy, att_col, 3, 12)
# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
('WARNING: ', 294, ' is an island (no neighbors)')
('WARNING: ', 353, ' is an island (no neighbors)')
k= 3 -> Moran's I = 0.347, p = 0.001
k= 4 -> Moran's I = 0.355, p = 0.001
k= 5 -> Moran's I = 0.333, p = 0.001
k= 6 -> Moran's I = 0.312, p = 0.001
k= 7 -> Moran's I = 0.304, p = 0.001
k= 8 -> Moran's I = 0.302, p = 0.001
k= 9 -> Moran's I = 0.297, p = 0.001
k=10 -> Moran's I = 0.287, p = 0.001
k=11 -> Moran's I = 0.282, p = 0.001
k=12 -> Moran's I = 0.274, p = 0.001
Avg Private Airbnb Prices - Spatial Lag¶
Describe spatial lag
# Spatial Lag
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Avg Private Airbnb Prices", colormap="YlGn")
Observe what we see in the plot
Avg Private Airbnb Prices - Standardized Data¶
Explain why we standardize (normalize) and how
# Standardize
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Avg Private Airbnb Prices", pointcolor="green")
Observe what we see in the plot
Avg Private Airbnb Prices - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I
w_sub, y, gdf_masked = handle_missing_values(gdf, weight_matrix, att_col, att_col_std, att_lag_col_std, w_col, std_w_col)
moran = run_global_MoranI(y, w_sub)
plot_global_MoranI(moran)
Moran's I: 0.35490103888586577 p-value: 0.001
Observe what we see in the plot
Avg Private Airbnb Prices - Local Indicators of Spatial Association (LISA)¶
# LISA
lisa, cluster_df = run_LISA(y, w_sub, att=None, mask=gdf_masked)
airbnb_prices_entirespace_clusterdf = cluster_df.copy()
plot_LISA(lisa, y, w_sub, mask=gdf_masked)
Observe what we see in the plot
map_LISA(lisa, y, w_sub, mask=cluster_df)
Observe what we see in the plot
Airbnb prices during high-season¶
Describe the attribute we are analyzing
We used a K-nearest neighbors spatial weights matrix with k = 5. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 4 and declining gradually for larger k; k = 5 was retained because smaller k still produced island neighborhoods.
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "avg_price_airbnb_entirespace_highseason", "avg_price_airbnb_entirespace_highseason_lag"
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))
k_value = 5
set_k_value(gdf_copy, att_col, 3, 12)
# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
('WARNING: ', 232, ' is an island (no neighbors)')
('WARNING: ', 236, ' is an island (no neighbors)')
('WARNING: ', 294, ' is an island (no neighbors)')
('WARNING: ', 422, ' is an island (no neighbors)')
k= 3 -> Moran's I = 0.298, p = 0.001
('WARNING: ', 422, ' is an island (no neighbors)')
k= 4 -> Moran's I = 0.309, p = 0.001
k= 5 -> Moran's I = 0.285, p = 0.001
k= 6 -> Moran's I = 0.285, p = 0.001
k= 7 -> Moran's I = 0.294, p = 0.001
k= 8 -> Moran's I = 0.289, p = 0.001
k= 9 -> Moran's I = 0.278, p = 0.001
k=10 -> Moran's I = 0.265, p = 0.001
k=11 -> Moran's I = 0.267, p = 0.001
k=12 -> Moran's I = 0.264, p = 0.001
Avg Airbnb Prices during High-season - Spatial Lag¶
Describe spatial lag
# Spatial Lag
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, " Avg Airbnb Prices during High-season", colormap="YlGn")
Observe what we see in the plot
Avg Airbnb Prices during High-season - Standardized Data¶
Explain why we standardize (normalize) and how
# Standardize
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, " Avg Airbnb Prices during High-season", pointcolor="green")
Observe what we see in the plot
Avg Airbnb Prices during High-season - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I
w_sub, y, gdf_masked = handle_missing_values(gdf, weight_matrix, att_col, att_col_std, att_lag_col_std, w_col, std_w_col)
moran = run_global_MoranI(y, w_sub)
plot_global_MoranI(moran)
Moran's I: 0.28498650491364647 p-value: 0.001
Observe what we see in the plot
Avg Airbnb Prices during High-season - Local Indicators of Spatial Association (LISA)¶
# LISA
lisa, cluster_df = run_LISA(y, w_sub, att=None, mask=gdf_masked)
airbnb_prices_entirespace_highseason_clusterdf = cluster_df.copy()
plot_LISA(lisa, y, w_sub, mask=gdf_masked)
Observe what we see in the plot
map_LISA(lisa, y, w_sub, mask=cluster_df)
Observe what we see in the plot
Airbnb Prices during Low-season¶
Describe the attribute we are analyzing
We used a K-nearest neighbors spatial weights matrix with k = 17. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–25 produced similar clustering patterns, with global Moran’s I peaking at k = 17 and declining gradually for larger k.
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "avg_price_airbnb_entirespace_lowseason", "avg_price_airbnb_entirespace_lowseason_lag"
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))
k_value = 17
set_k_value(gdf_copy, att_col, 15, 20)
# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
('WARNING: ', 293, ' is an island (no neighbors)')
k=15 -> Moran's I = 0.164, p = 0.001
('WARNING: ', 293, ' is an island (no neighbors)')
k=16 -> Moran's I = 0.168, p = 0.001
k=17 -> Moran's I = 0.176, p = 0.001
k=18 -> Moran's I = 0.172, p = 0.001
k=19 -> Moran's I = 0.169, p = 0.001
k=20 -> Moran's I = 0.159, p = 0.001
Avg Airbnb Prices during Low-season - Spatial Lag¶
Describe spatial lag
# Spatial Lag
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Avg Airbnb Prices during Low Season", colormap="YlGn")
Observe what we see in the plot
Avg Airbnb Prices during Low-season - Standardized Data¶
Explain why we standardize (normalize) and how
# Standardize
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Avg Airbnb Prices during Low Season", pointcolor="green")
Observe what we see in the plot
Airbnb Prices during Low-season - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I
w_sub, y, gdf_masked = handle_missing_values(gdf, weight_matrix, att_col, att_col_std, att_lag_col_std, w_col, std_w_col)
moran = run_global_MoranI(y, w_sub)
plot_global_MoranI(moran)
Moran's I: 0.17589438808237565 p-value: 0.001
Observe what we see in the plot
Avg Airbnb Prices during Low-season - Local Indicators of Spatial Association (LISA)¶
# LISA
lisa, cluster_df = run_LISA(y, w_sub, att=None, mask=gdf_masked)
airbnb_prices_entirespace_lowseason_clusterdf = cluster_df.copy()
plot_LISA(lisa, y, w_sub, mask=gdf_masked)
Observe what we see in the plot
map_LISA(lisa, y, w_sub, mask=cluster_df)
Observe what we see in the plot
Airbnb Clusters¶
Describe the attribute we are analyzing
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "count_airbnbs", "count_airbnbs_lag"
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))
k_value = 4
set_k_value(gdf_copy, att_col, 3, 12)
# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.419, p = 0.001 k= 4 -> Moran's I = 0.422, p = 0.001 k= 5 -> Moran's I = 0.413, p = 0.001 k= 6 -> Moran's I = 0.408, p = 0.001 k= 7 -> Moran's I = 0.405, p = 0.001 k= 8 -> Moran's I = 0.395, p = 0.001 k= 9 -> Moran's I = 0.387, p = 0.001 k=10 -> Moran's I = 0.381, p = 0.001 k=11 -> Moran's I = 0.375, p = 0.001 k=12 -> Moran's I = 0.367, p = 0.001
We used a K-nearest neighbors spatial weights matrix with k = 4. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 4 and declining gradually for larger k.
Airbnb Count per neighborhood - Spatial Lag¶
Describe spatial lag
# Spatial Lag
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Number of Airbnbs per neighborhood")
Observe what we see in the plot
Airbnb Count per neighborhood - Standardized Data¶
Explain why we standardize (normalize) and how
# Standardize
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Number of Airbnbs per neighborhood")
Observe what we see in the plot
Airbnb Count per neighborhood - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I
moran = run_global_MoranI(gdf, weight_matrix, att="count_airbnbs")
plot_global_MoranI(moran)
Moran's I: 0.42245557023524244 p-value: 0.001
Observe what we see in the plot
Airbnb Count per neighborhood - Local Indicators of Spatial Association (LISA)¶
# LISA
lisa, cluster_df = run_LISA(gdf, weight_matrix, att="count_airbnbs")
count_airbnbs_clusterdf = cluster_df.copy()
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="pink")
Observe what we see in the plot
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="PuRd", sig_cmap="tab10", sig_transparency=0.6, quad_c1="#B22222", quad_c2="#F8C8DC", quad_c3="#3D426B", quad_c4="#7894B0", clust_c1="#B22222", clust_c2="#F8C8DC", clust_c3="#3D426B", clust_c4="#7894B0")
Observe what we see in the plot
Tourism Autocorrelation Analysis¶
Tourism POI Clusters¶
Describe the attribute we are analyzing
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "count_pois", "count_pois_lag"
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))
k_value = 10
set_k_value(gdf_copy, att_col, 3, 12)
# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.120, p = 0.004 k= 4 -> Moran's I = 0.150, p = 0.001 k= 5 -> Moran's I = 0.144, p = 0.001 k= 6 -> Moran's I = 0.138, p = 0.001 k= 7 -> Moran's I = 0.151, p = 0.001 k= 8 -> Moran's I = 0.139, p = 0.001 k= 9 -> Moran's I = 0.147, p = 0.001 k=10 -> Moran's I = 0.155, p = 0.001 k=11 -> Moran's I = 0.147, p = 0.001 k=12 -> Moran's I = 0.144, p = 0.001
We used a K-nearest neighbors spatial weights matrix with k = 10. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 10 and declining gradually for larger k.
Tourism POI count per neighborhood - Spatial Lag¶
Describe spatial lag
# Spatial Lag
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Tourism POIs per neighborhood")
Observe what we see in the plot
Tourism POI count per neighborhood - Standardized Data¶
Explain why we standardize (normalize) and how
# Standardize
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Tourism POIs per neighborhood", figsizew=6, figsizeh=3, pointcolor="purple", linecolor="cyan", label_color="blue")
Observe what we see in the plot
Tourism POI count per neighborhood - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I
moran = run_global_MoranI(gdf, weight_matrix, att="count_pois")
plot_global_MoranI(moran)
Moran's I: 0.15470330400512608 p-value: 0.001
Observe what we see in the plot
Tourism POI count per neighborhood - Local Indicators of Spatial Association (LISA)¶
# LISA
lisa, cluster_df = run_LISA(gdf, weight_matrix, att="count_pois")
count_pois_clusterdf = cluster_df.copy()
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="purple")
Observe what we see in the plot
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="cool", sig_cmap="PuOr", quad_c1="#BF40BF", quad_c2="#bfbcfc", quad_c3="#010057", quad_c4="#c1d8f0", clust_c1="#BE00FE", clust_c2="#bfbcfc", clust_c3="#010057", clust_c4="#c1d8f0")
Observe what we see in the plot
Museum Clusters¶
Describe the attribute we are analyzing
# Analysis Parameters
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "count_museums", "count_museums_lag"
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))
k_value = 7
set_k_value(gdf_copy, att_col, 3, 12)
# Weighted Matrix
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.173, p = 0.003 k= 4 -> Moran's I = 0.238, p = 0.001 k= 5 -> Moran's I = 0.248, p = 0.001 k= 6 -> Moran's I = 0.257, p = 0.001 k= 7 -> Moran's I = 0.281, p = 0.001 k= 8 -> Moran's I = 0.271, p = 0.001 k= 9 -> Moran's I = 0.267, p = 0.001 k=10 -> Moran's I = 0.270, p = 0.001 k=11 -> Moran's I = 0.261, p = 0.001 k=12 -> Moran's I = 0.257, p = 0.001
We used a K-nearest neighbors spatial weights matrix with k = 7. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 7 and declining gradually for larger k.
Museum count per neighborhood - Spatial Lag¶
Describe spatial lag
# Spatial Lag
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, "Museum Count per neighborhood")
Observe what we see in the plot
Museum count per neighborhood - Standardized Data¶
Explain why we standardize (normalize) and how
# Standardize
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(gdf, att_col_std, att_lag_col_std, "Museum Count per neighborhood", figsizeh=3, figsizew=6, pointcolor="purple", marker="^", linecolor="cyan", label_color="blue")
Observe what we see in the plot
Museum count per neighborhood - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I
# Quantify global spatial autocorrelation of museum counts.
museum_att = "count_museums"
moran = run_global_MoranI(gdf, weight_matrix, att=museum_att)
plot_global_MoranI(moran)
Moran's I: 0.2814197362236021 p-value: 0.001
Observe what we see in the plot
Museum count per neighborhood - Local Indicators of Spatial Association (LISA)¶
# LISA
# Local Indicators of Spatial Association for museum counts: classifies each
# neighborhood into High-High / Low-Low / High-Low / Low-High quadrants.
lisa, cluster_df = run_LISA(gdf, weight_matrix, att="count_museums")
# Keep a copy for the concluding heatmap section.
count_museums_clusterdf = cluster_df.copy()
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="purple")
Observe what we see in the plot
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="cool", sig_cmap="PuOr", quad_c1="#BF40BF", quad_c2="#bfbcfc", quad_c3="#010057", quad_c4="#c1d8f0", clust_c1="#BE00FE", clust_c2="#bfbcfc", clust_c3="#010057", clust_c4="#c1d8f0")
Observe what we see in the plot
Art Gallery Clusters¶
Describe the attribute we are analyzing
# Analysis Parameters
# Art-gallery counts per neighborhood: derive all working column names from
# the base attribute so the naming stays consistent across sections.
gdf_copy = neighborhood_stats.copy()
att_col, att_lag_col = "count_galleries", "count_galleries_lag"
att_col_std, att_lag_col_std = f"{att_col}_std", f"{att_lag_col}_std"
w_col, std_w_col = f"w_{att_col}", f"w_{att_col_std}"
k_value = 3
# Sensitivity sweep over k in [3, 12] to justify the chosen k_value.
set_k_value(gdf_copy, att_col, 3, 12)
# Weighted Matrix
# prep_data presumably builds the KNN spatial weights (helper defined earlier in the file).
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.443, p = 0.001 k= 4 -> Moran's I = 0.373, p = 0.001 k= 5 -> Moran's I = 0.352, p = 0.001 k= 6 -> Moran's I = 0.348, p = 0.001 k= 7 -> Moran's I = 0.335, p = 0.001 k= 8 -> Moran's I = 0.302, p = 0.001 k= 9 -> Moran's I = 0.274, p = 0.001 k=10 -> Moran's I = 0.266, p = 0.001 k=11 -> Moran's I = 0.252, p = 0.001 k=12 -> Moran's I = 0.247, p = 0.001
We used a K-nearest neighbors spatial weights matrix with k = 3. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 3 and declining gradually as k increases.
Art Gallery count per neighborhood - Spatial Lag¶
Describe spatial lag
# Spatial Lag
# Compute the spatial lag of gallery counts and plot attribute vs. lag.
gallery_lag_title = "Art Galleries per neighborhood"
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, gallery_lag_title)
Observe what we see in the plot
Art Gallery count per neighborhood - Standardized Spatial Lag¶
Explain why we standardize (normalize) and how
# Standardize
# Put the attribute and its spatial lag on a comparable (standardized) scale,
# then plot the standardized Moran scatter.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(
    gdf,
    att_col_std,
    att_lag_col_std,
    "Art Galleries per neighborhood",
    figsizeh=3,
    figsizew=6,
    pointcolor="purple",
    marker="D",
    linecolor="cyan",
    label_color="blue",
)
Observe what we see in the plot
Art Gallery count per neighborhood - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I
# Quantify global spatial autocorrelation of art-gallery counts.
gallery_att = "count_galleries"
moran = run_global_MoranI(gdf, weight_matrix, att=gallery_att)
plot_global_MoranI(moran)
Moran's I: 0.4434317494717987 p-value: 0.001
Observe what we see in the plot
Art Gallery count per neighborhood - Local Indicators of Spatial Association (LISA)¶
# LISA
# Local Indicators of Spatial Association for gallery counts.
lisa, cluster_df = run_LISA(gdf, weight_matrix, att="count_galleries")
# Keep a copy for the concluding heatmap section.
count_galleries_clusterdf = cluster_df.copy()
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="purple")
Observe what we see in the plot
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="cool", sig_cmap="PuOr", quad_c1="#BF40BF", quad_c2="#bfbcfc", quad_c3="#010057", quad_c4="#c1d8f0", clust_c1="#BE00FE", clust_c2="#bfbcfc", clust_c3="#010057", clust_c4="#c1d8f0")
Observe what we see in the plot
Monuments Clusters¶
Describe the attribute we are analyzing
# Analysis Parameters
# Monument counts per neighborhood: set up working column names and the KNN k.
gdf_copy = neighborhood_stats.copy()
# Fix: the lag column was misspelled "count_monumentss_lag" (double "s");
# use the same "<attribute>_lag" naming convention as the museum/gallery sections.
att_col, att_lag_col = "count_monuments", "count_monuments_lag"
att_col_std, att_lag_col_std = (str(att_col) + "_std"), (str(att_lag_col) + "_std")
w_col, std_w_col = ("w_" + str(att_col)), ("w_" + str(att_col_std))
k_value = 5
# Sensitivity sweep over k in [3, 12] to justify the chosen k_value.
set_k_value(gdf_copy, att_col, 3, 12)
# Weighted Matrix
# prep_data presumably builds the KNN spatial weights (helper defined earlier in the file).
gdf, weight_matrix = prep_data(gdf_copy, k_value)
k= 3 -> Moran's I = 0.630, p = 0.001 k= 4 -> Moran's I = 0.631, p = 0.001 k= 5 -> Moran's I = 0.652, p = 0.001 k= 6 -> Moran's I = 0.631, p = 0.001 k= 7 -> Moran's I = 0.647, p = 0.001 k= 8 -> Moran's I = 0.621, p = 0.001 k= 9 -> Moran's I = 0.612, p = 0.001 k=10 -> Moran's I = 0.610, p = 0.001 k=11 -> Moran's I = 0.593, p = 0.001 k=12 -> Moran's I = 0.592, p = 0.001
We used a K-nearest neighbors spatial weights matrix with k = 5. This value was chosen as the smallest k for which all non-missing neighborhoods retained at least one valid neighbor (no “islands”) after masking missing values, while keeping neighbor distances within a neighborhood scale. Sensitivity checks with k = 3–12 produced similar clustering patterns, with global Moran’s I peaking at k = 5 and declining gradually for larger k.
Monument count per neighborhood - Spatial Lag¶
Describe spatial lag
# Spatial Lag
# Compute the spatial lag of monument counts; plotted with a
# yellow-orange-brown colormap to distinguish it from the other sections.
monument_lag_title = "Historic Monuments per Neighborhood"
gdf = spatial_lag(gdf, att_col, att_lag_col, weight_matrix, k_value)
plot_spatial_lag(gdf, att_col, att_lag_col, k_value, monument_lag_title, colormap="YlOrBr")
Observe what we see in the plot
Monument count per neighborhood - Standardized Data¶
Explain why we standardize (normalize) and how
# Standardize
# Put the attribute and its spatial lag on a comparable (standardized) scale,
# then plot the standardized Moran scatter.
gdf = standardize_values(gdf, weight_matrix, att_col, att_lag_col, att_col_std, att_lag_col_std)
plot_standardized_values(
    gdf,
    att_col_std,
    att_lag_col_std,
    "Historic Monuments per Neighborhood",
    figsizeh=3,
    figsizew=5,
    pointcolor="orange",
    marker="p",
    linecolor="green",
    label_color="brown",
)
Observe what we see in the plot
Monument count per neighborhood - Global Moran's I¶
Describe what global autocorrelation is and what Moran's I is computing
# Global Moran's I
# Quantify global spatial autocorrelation of monument counts.
monument_att = "count_monuments"
moran = run_global_MoranI(gdf, weight_matrix, att=monument_att)
plot_global_MoranI(moran)
Moran's I: 0.6518726591100805 p-value: 0.001
Observe what we see in the plot
Monument count per neighborhood - Local Indicators of Spatial Association (LISA)¶
# LISA
# Local Indicators of Spatial Association for monument counts.
lisa, cluster_df = run_LISA(gdf, weight_matrix, att="count_monuments")
# Keep a copy for the concluding heatmap section.
count_monuments_clusterdf = cluster_df.copy()
plot_LISA(lisa, cluster_df, weight_matrix, sigcolor="orange")
Observe what we see in the plot
map_LISA(lisa, cluster_df, weight_matrix, stats_cmap="viridis", sig_cmap="Paired", quad_c1="#ff4d01", quad_c2="#F9AD6f", quad_c3="#010057", quad_c4="#c1d8f0", clust_c1="#ff4d01", clust_c2="#F9AD6f", clust_c3="#010057", clust_c4="#c1d8f0")
Observe what we see in the plot
Concluding Analysis¶
Heatmaps¶
# Add the cluster designation to each neighborhood polygon from the different datasets.
cluster_heatmap_gdf = neighborhood_stats.copy()
# (source cluster DataFrame, name the "cluster" column gets after the merge)
cluster_sources = [
    (airbnb_prices_entirespace_clusterdf, "cluster_airbnbprices"),
    (count_airbnbs_clusterdf, "cluster_airbnbs"),
    (count_pois_clusterdf, "cluster_pois"),
    (count_museums_clusterdf, "cluster_museums"),
    (count_galleries_clusterdf, "cluster_galleries"),
    (count_monuments_clusterdf, "cluster_monuments"),
]
for source_df, target_col in cluster_sources:
    labels = source_df[["CBS_Buurtcode", "cluster"]].rename(columns={"cluster": target_col})
    # Left-join keeps every neighborhood; unmatched ones get NaN labels.
    cluster_heatmap_gdf = cluster_heatmap_gdf.merge(labels, on="CBS_Buurtcode", how="left")
# Define a helper to map LISA hotspots and coldspots on a contextily basemap.
def create_heatmap(title, attribute, hotspotcolor="#d7191c", coldspotcolor="#2b83ba", transparency=0.6, gdf=None):
    """Map the High-High (hotspot) and Low-Low (coldspot) LISA clusters of one attribute.

    Parameters
    ----------
    title : str
        Human-readable attribute name used in the legend entries and plot title.
    attribute : str
        Column of *gdf* holding LISA cluster labels ("High-High", "Low-Low", ...).
    hotspotcolor, coldspotcolor : str
        Fill colors for the High-High and Low-Low polygons.
    transparency : float
        Alpha for the cluster polygons so the basemap stays visible underneath.
    gdf : GeoDataFrame, optional
        Source polygons. Defaults to the module-level ``cluster_heatmap_gdf``
        (backward compatible with existing calls); passing it explicitly makes
        the helper reusable with other cluster tables.
    """
    if gdf is None:
        gdf = cluster_heatmap_gdf
    hh_ll = gdf[gdf[attribute].isin(["High-High", "Low-Low"])].copy()
    # Reproject to Web Mercator so polygons line up with the tile basemap.
    hh_ll = hh_ll.to_crs(epsg=3857)
    fig, ax = plt.subplots(figsize=(8, 8))
    hh_ll[hh_ll[attribute] == "High-High"].plot(ax=ax, color=hotspotcolor, edgecolor="white", linewidth=0.3, alpha=transparency)
    hh_ll[hh_ll[attribute] == "Low-Low"].plot(ax=ax, color=coldspotcolor, edgecolor="white", linewidth=0.3, alpha=transparency)
    contextily.add_basemap(ax, crs=hh_ll.crs, source=contextily.providers.CartoDB.Positron)
    handles = [mpatches.Patch(color=hotspotcolor, label="High " + str(title) + " clusters"), mpatches.Patch(color=coldspotcolor, label="Low " + str(title) + " clusters")]
    ax.legend(handles=handles, loc="lower left", fontsize=8, title="LISA clusters")
    ax.set_axis_off()
    plt.title("Hotspots and Coldspots of " + str(title) + " clusters over Amsterdam")
    plt.tight_layout()
    plt.show()
Airbnb Price hotspots and coldspots¶
create_heatmap("Airbnb Price", "cluster_airbnbprices")
Airbnb Location hotspots and coldspots¶
create_heatmap("Airbnb Locations", "cluster_airbnbs")
Tourism POI hotspots and coldspots¶
create_heatmap("Tourism POIs", "cluster_pois")
Museum hotspots and coldspots¶
create_heatmap("Museum", "cluster_museums")
Art Gallery hotspots and coldspots¶
create_heatmap("Art Gallery", "cluster_galleries")
Monuments hotspots and coldspots¶
create_heatmap("Monument", "cluster_monuments")
Intersections of Spatial Correlation¶
Airbnb price cluster intersection with any tourism-related cluster¶
# Identify Airbnb clusters that match any of the tourism clusters
tourism_cols = ["cluster_pois", "cluster_museums", "cluster_galleries", "cluster_monuments"]
# Row-wise: does any tourism cluster label equal the Airbnb price label?
match_any = (
    cluster_heatmap_gdf[tourism_cols]
    .eq(cluster_heatmap_gdf["cluster_airbnbprices"], axis=0)
    .any(axis=1)
)
# Keep the Airbnb label where it matches; NaN elsewhere so dropna() filters it out.
cluster_heatmap_gdf["cluster_airbnb_match_any_tourism"] = np.where(
    match_any, cluster_heatmap_gdf["cluster_airbnbprices"], np.nan
)
match_gdf = cluster_heatmap_gdf.dropna(subset=["cluster_airbnb_match_any_tourism"]).copy()
match_gdf = match_gdf.to_crs(epsg=3857)  # Web Mercator for the tile basemap
color_map = {"High-High": "#d7191c", "Low-Low": "#2b83ba", "High-Low": "#fdae61", "Low-High": "#c1d8f0"}
fig, ax = plt.subplots(figsize=(8, 8))
handles = []
for cluster_label, fill_color in color_map.items():
    in_cluster = match_gdf["cluster_airbnb_match_any_tourism"] == cluster_label
    # Only plot (and add a legend entry for) classes that actually occur.
    if in_cluster.any():
        match_gdf.loc[in_cluster].plot(ax=ax, color=fill_color, edgecolor="white", linewidth=0.3, alpha=0.6)
        handles.append(mpatches.Patch(color=fill_color, label=cluster_label))
contextily.add_basemap(ax, crs=match_gdf.crs, source=contextily.providers.CartoDB.Voyager)
if handles:
    ax.legend(handles=handles, title="Airbnb cluster matching\na tourism cluster", loc="lower right", fontsize=8)
ax.set_axis_off()
plt.title("Where Airbnb LISA clusters match\nat least one tourism LISA cluster", fontsize=12)
plt.tight_layout()
plt.show()
# Show details of matching Airbnb and tourism clusters, most expensive first.
detail_cols = [
    'Buurtcode', 'Buurt', 'count_airbnbs', 'avg_price_airbnb_entirespace',
    'count_pois', 'count_museums', 'count_galleries', 'count_monuments',
    'cluster_airbnbprices', 'cluster_pois', 'cluster_museums',
    'cluster_galleries', 'cluster_monuments',
]
# Drop rows whose Airbnb price cluster itself is not significant.
show_match_gdf = match_gdf.loc[match_gdf['cluster_airbnbprices'] != "Not significant"]
show_match_gdf.loc[:, detail_cols].sort_values(by='avg_price_airbnb_entirespace', ascending=False)
| Buurtcode | Buurt | count_airbnbs | avg_price_airbnb_entirespace | count_pois | count_museums | count_galleries | count_monuments | cluster_airbnbprices | cluster_pois | cluster_museums | cluster_galleries | cluster_monuments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 164 | AE04 | Nes e.o. | 15 | 431.750000 | 3 | 1 | 0 | 80 | High-High | High-High | High-High | Not significant | High-High |
| 5 | AD03 | Nieuwendijk-Noord | 25 | 380.200000 | 10 | 1 | 1 | 99 | High-High | High-High | High-High | Not significant | High-High |
| 9 | AD06 | Spuistraat-Zuid | 25 | 375.000000 | 6 | 1 | 1 | 145 | High-High | High-High | High-High | Not significant | High-High |
| 160 | AE01 | Kop Zeedijk | 21 | 373.333333 | 8 | 2 | 0 | 187 | High-High | High-High | High-High | Low-High | High-High |
| 10 | AD07 | Begijnhofbuurt | 12 | 352.666667 | 14 | 1 | 0 | 160 | High-High | High-High | High-High | Not significant | High-High |
| 163 | AE03 | Burgwallen-Oost | 54 | 343.363636 | 7 | 4 | 2 | 285 | High-High | High-High | High-High | Not significant | High-High |
| 446 | AB11 | Passeerdersgrachtbuurt | 22 | 340.857143 | 0 | 0 | 0 | 53 | High-High | Not significant | Low-High | Not significant | High-High |
| 414 | KN01 | Prinses Irenebuurt | 11 | 337.000000 | 5 | 0 | 0 | 5 | High-High | High-High | Not significant | Not significant | Not significant |
| 161 | AE02 | Oude Kerk e.o. | 34 | 332.500000 | 8 | 1 | 1 | 139 | High-High | High-High | High-High | High-High | High-High |
| 399 | KJ04 | Minervabuurt-Zuid | 7 | 329.500000 | 3 | 0 | 0 | 1 | High-High | High-High | Not significant | Not significant | Not significant |
| 7 | AD05 | Nieuwe Kerk e.o. | 35 | 305.923077 | 17 | 2 | 1 | 133 | High-High | High-High | High-High | High-High | High-High |
| 1 | AC03 | Felix Meritisbuurt | 63 | 302.411765 | 3 | 0 | 1 | 460 | High-High | High-High | Low-High | High-High | High-High |
| 173 | AF08 | Zuiderkerkbuurt | 34 | 302.100000 | 3 | 0 | 0 | 136 | High-High | High-High | Low-High | Not significant | High-High |
| 159 | EV02 | Vondelparkbuurt-Oost | 36 | 295.846154 | 8 | 0 | 0 | 20 | High-High | High-High | Low-High | Not significant | Not significant |
| 2 | AC04 | Leidsegracht-Noord | 16 | 295.200000 | 4 | 3 | 0 | 130 | High-High | Not significant | Not significant | Not significant | High-High |
| 167 | AF02 | Lastage | 31 | 287.692308 | 0 | 0 | 0 | 122 | High-High | Not significant | Low-High | Not significant | High-High |
| 185 | AH06 | Leidsebuurt-Zuidoost | 8 | 278.250000 | 8 | 2 | 0 | 7 | High-High | High-High | Not significant | Not significant | Low-High |
| 171 | AF06 | Uilenburg | 17 | 278.083333 | 2 | 0 | 0 | 32 | High-High | Low-High | Not significant | Not significant | High-High |
| 162 | AH02 | Leidsebuurt-Noordoost | 46 | 269.727273 | 5 | 1 | 2 | 109 | High-High | Not significant | High-High | Not significant | High-High |
| 266 | NQ10 | Durgerdam | 8 | 266.500000 | 3 | 0 | 0 | 69 | High-Low | High-Low | Not significant | Not significant | Not significant |
| 8 | AH01 | Leidsebuurt-Noordwest | 14 | 265.125000 | 1 | 0 | 0 | 29 | High-High | Not significant | Low-High | Not significant | High-High |
| 165 | AE05 | BG-terrein e.o. | 28 | 263.142857 | 16 | 2 | 0 | 150 | High-High | High-High | High-High | Not significant | High-High |
| 213 | NC02 | Tuindorp Oostzaan-Oost | 42 | 223.705882 | 1 | 0 | 0 | 8 | Low-Low | Low-Low | Not significant | Not significant | Not significant |
| 224 | NF01 | Buiksloterdijk-West | 2 | 205.000000 | 0 | 0 | 0 | 10 | Low-Low | Low-Low | Not significant | Not significant | Not significant |
| 327 | FN04 | Staalmanbuurt | 16 | 201.666667 | 1 | 0 | 0 | 0 | Low-Low | Low-Low | Not significant | Not significant | Not significant |
| 212 | NC01 | Tuindorp Oostzaan-West | 6 | 198.000000 | 0 | 0 | 0 | 0 | Low-Low | Low-Low | Not significant | Not significant | Not significant |
| 221 | NE04 | Banne-Zuidoost | 23 | 192.818182 | 0 | 0 | 0 | 3 | Low-Low | Low-Low | Not significant | Not significant | Not significant |
| 223 | NE06 | Marjoleinterrein | 2 | 180.000000 | 1 | 0 | 0 | 0 | Low-Low | Low-Low | Not significant | Not significant | Not significant |
| 244 | NK01 | Bloemenbuurt-Noord | 44 | 178.444444 | 1 | 0 | 0 | 0 | Low-Low | Low-Low | Not significant | Not significant | Not significant |
| 109 | MJ03 | Rieteilanden-West | 27 | 172.285714 | 1 | 0 | 0 | 0 | Low-High | Low-Low | Not significant | Low-High | Not significant |
| 218 | NE01 | Banne-Noordwest | 7 | 142.666667 | 0 | 0 | 0 | 0 | Low-Low | Low-Low | Not significant | Not significant | Not significant |
Airbnb price cluster intersection with every tourism-related cluster¶
# Identify neighborhoods whose LISA label agrees across ALL five attributes.
cols = ["cluster_airbnbprices", "cluster_pois", "cluster_museums", "cluster_galleries", "cluster_monuments"]
cluster_heatmap_gdf["cluster_all"] = np.nan
# A row can be unanimous for at most one label, so the masks are mutually
# exclusive and the assignment order does not matter.
for shared_label in ["High-High", "Low-Low", "High-Low", "Low-High"]:
    unanimous = (cluster_heatmap_gdf[cols] == shared_label).all(axis=1)
    cluster_heatmap_gdf.loc[unanimous, "cluster_all"] = shared_label
cluster_heatmap_gdf["cluster_all"].value_counts()
# Map the neighborhoods where all five LISA classifications agree.
all5 = cluster_heatmap_gdf.dropna(subset=["cluster_all"]).copy()
all5 = all5.to_crs(epsg=3857)  # Web Mercator for the tile basemap
color_map = {"High-High": "#d7191c","Low-Low":"#2b83ba","High-Low":"#fdae61","Low-High":"#abdda4"}
fig, ax = plt.subplots(figsize=(12, 6))
handles = []
for label, color in color_map.items():
    sel = all5["cluster_all"] == label
    # Only plot (and add a legend entry for) classes that actually occur.
    if sel.any():
        all5.loc[sel].plot(ax=ax, color=color, edgecolor="white", linewidth=0.3, alpha=0.6)
        handles.append(mpatches.Patch(color=color, label=label))
# NOTE: a commented-out axis-padding snippet (dead code inside a triple-quoted
# string) was removed here; it had no runtime effect.
contextily.add_basemap(ax, crs=all5.crs, source=contextily.providers.OpenStreetMap.Mapnik)
if handles:
    ax.legend(handles=handles, title="Highest Airbnb Prices and Tourism Draw", loc="lower left", fontsize=8)
ax.set_axis_off()
plt.title("Neighborhoods with the same LISA cluster classification for all Tourism and Airbnb Clusters", fontsize=12)
plt.tight_layout()
plt.show()
# Show details of the most sought-after neighborhoods (all five clusters agree).
summary_cols = [
    'Buurtcode', 'Buurt', 'count_airbnbs', 'avg_price_airbnb_entirespace',
    'count_pois', 'count_museums', 'count_galleries', 'count_monuments',
    'Oppervlakte_m2',
]
all5.loc[:, summary_cols]
| Buurtcode | Buurt | count_airbnbs | avg_price_airbnb_entirespace | count_pois | count_museums | count_galleries | count_monuments | Oppervlakte_m2 | |
|---|---|---|---|---|---|---|---|---|---|
| 7 | AD05 | Nieuwe Kerk e.o. | 35 | 305.923077 | 17 | 2 | 1 | 133 | 83841 |
| 161 | AE02 | Oude Kerk e.o. | 34 | 332.500000 | 8 | 1 | 1 | 139 | 92282 |
As these are the only two neighborhoods classified as High-High LISA clusters for all of the Airbnb price clusters, tourism POI clusters, museum clusters, art gallery clusters, and monument clusters, we can estimate that the Nieuwe Kerk e.o. and Oude Kerk e.o. neighborhoods are the most sought-after neighborhoods for tourism in Amsterdam.
Plotting Airbnb Price vs Tourism¶
# Plot average Airbnb price against tourism site count per neighborhood.
scatter_df = cluster_heatmap_gdf.copy()
# Combined tourism draw: tourism POIs plus monuments.
scatter_df["tourism_total"] = scatter_df["count_pois"] + scatter_df["count_monuments"]
scatter_df = scatter_df.dropna(subset=["avg_price_airbnb_entirespace", "tourism_total", "cluster_airbnbprices"])
# Restrict the hue to the four significant LISA classes.
valid_clusters = ["High-High", "Low-Low", "High-Low", "Low-High"]
scatter_df = scatter_df[scatter_df["cluster_airbnbprices"].isin(valid_clusters)]
cluster_palette = {"High-High": "#d7191c", "Low-Low": "#2b83ba", "High-Low": "#fdae61", "Low-High": "#c1d8f0"}
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=scatter_df,
    x="tourism_total",
    y="avg_price_airbnb_entirespace",
    hue="cluster_airbnbprices",
    palette=cluster_palette,
    alpha=0.8,
    edgecolor="white",
    linewidth=0.5,
)
# Overlay a linear trend with a 95% confidence band.
sns.regplot(
    data=scatter_df,
    x="tourism_total",
    y="avg_price_airbnb_entirespace",
    scatter=False,
    ci=95,
    color="green",
)
plt.xlabel("Number of tourism sites")
plt.ylabel("Average Airbnb price")
plt.title("Avg Airbnb Price vs. Tourism Sites per Neighborhood")
plt.ylim(bottom=100)
plt.tight_layout()
plt.show()
Conclusion¶
The prices of Airbnbs are clustered around three areas. First, Airbnb prices go up with proximity to the historically and culturally rich Center-West canals. Second, the clustering of higher Airbnb prices certainly occurs around the city's two major parks, Vondelpark and Oosterpark; however, this is not true for parks at the edge of the city limits like Rembrandtpark and the Amsterdamse Bos, which actually have a cluster of lower Airbnb prices. Third, there is a clustering of higher Airbnb prices near the major bar and clubbing area of Leidseplein.
We notice that the higher prices of Airbnbs are more spatially clustered with tourism sites, whereas the LISA autocorrelation performed on the clustering of Airbnb locations at the beginning of this analysis showed that Airbnb locations were concentrated on the south/southwest periphery of the tourism center. The commonality between the autocorrelation of Airbnb prices and the autocorrelation of Airbnb locations is their shared clustering around Vondelpark.